NYC Building Violation Analysis EDAV Project

Author

Shreya Shetty (svs2148) & Shruti Shetty (ss7592)

library(socratadata)
library(dplyr)
library(lubridate)
library(janitor)
library(readr)
library(ggplot2)
library(ggalluvial)
housing_filtered <- read.csv("/Users/shreyashetty/Documents/Fall 2025 Courses/EDAV/Final_Project/Housing_Maintenance_Code_Violations_20251125.csv")

cat(nrow(housing_filtered))
143112
sr311_filtered <- read.csv("/Users/shreyashetty/Documents/Fall 2025 Courses/EDAV/Final_Project/311_Service_Requests_from_2010_to_Present_20251125.csv")

cat(nrow(sr311_filtered))
701761
# view(housing_filtered)
# view(sr311_filtered)
head(housing_filtered)
  ViolationID BuildingID RegistrationID BoroID   Borough HouseNumber
1    17442503      11076         100711      1 MANHATTAN         142
2    17576658      27937         108549      1 MANHATTAN         121
3    17964149      19366         107245      1 MANHATTAN         171
4    17601375      42010         105105      1 MANHATTAN         202
5    17601376      42010         105105      1 MANHATTAN         202
6    17390227      42340         125158      1 MANHATTAN         450
  LowHouseNumber HighHouseNumber         StreetName StreetCode Postcode
1            142             148      EAST 7 STREET      17130    10009
2            121             121 ST NICHOLAS AVENUE      31190    10026
3            171             171    EAST 111 STREET      19190    10029
4            202             216    WEST 146 STREET      36690    10039
5            202             216    WEST 146 STREET      36690    10039
6            450             450    WEST 149 STREET      36750    10031
  Apartment Story Block Lot Class InspectionDate ApprovedDate
1        5A     1   402  15     C     11/25/2024   11/25/2024
2        2B     2  1922  24     B     12/19/2024   12/20/2024
3        1E     1  1639  29     B     06/05/2025   06/11/2025
4        4L     4  2031  38     B     01/06/2025   01/06/2025
5        4L     4  2031  38     B     01/06/2025   01/06/2025
6         7     1  2063  50     B     11/01/2024   11/01/2024
  OriginalCertifyByDate OriginalCorrectByDate NewCertifyByDate NewCorrectByDate
1            12/15/2024            12/10/2024                                  
2            02/10/2025            01/27/2025                                  
3            07/31/2025            07/17/2025                                  
4            02/25/2025            02/11/2025                                  
5            02/25/2025            02/11/2025                                  
6            12/23/2024            12/09/2024                                  
  CertifiedDate OrderNumber   NOVID
1    12/13/2024         530 9458247
2    07/30/2025        1503 9549552
3                       583 9847172
4                       529 9567076
5                       502 9567076
6    11/12/2024        1503 9410632
                                                                                                                                                                                                                                         NOVDescription
1 § 27-2005, 27-2007, 27-2041.1 HMC, §238, § 309; § 107 (2) ( C) MDL AND 28 RCNY §25-171: REPLACE OR REPAIR THE SELF-CLOSING DOORS THAT IS MISSING OR DEFECTIVE LATCHSET IN THE ENTRANCE LOCATED AT APT 5A, 1st STORY, 2nd APARTMENT FROM EAST AT SOUTH
2                                                                          § 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). DEFECTIVE IN THE ENTIRE APARTMENT LOCATED AT APT 2B, 2nd STORY, 2nd APARTMENT FROM NORTH AT EAST
3                                                                      § 27-2026, 2027 HMC: PROPERLY REPAIR THE SOURCE AND ABATE THE EVIDENCE OF A WATER LEAK AT CEILING IN THE BATHROOM LOCATED AT APT 1E, 1st STORY, 1st APARTMENT FROM NORTH AT EAST
4                                                                                                       § 27-2005 HMC: REFIT\032 1ST.WINDOW FROM WEST AT NORTH IN THE 2nd ROOM FROM EAST LOCATED AT APT 4L, 4th STORY, 3rd APARTMENT FROM WEST AT NORTH
5                                                          § 27-2005 ADM CODE PROPERLY REPAIR WITH SIMILAR MATERIAL THE BROKEN OR DEFECTIVE SILICONE CAULKING AT BATHTUB IN THE BATHROOM LOCATED AT APT 4L, 4th STORY, 3rd APARTMENT FROM WEST AT NORTH
6                                                                             § 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 7, 1st STORY, 2nd APARTMENT FROM SOUTH AT WEST
  NOVIssuedDate CurrentStatusID       CurrentStatus CurrentStatusDate  NovType
1    11/26/2024               9 VIOLATION DISMISSED        02/24/2025 Original
2    12/23/2024               4  NOV CERTIFIED LATE        07/30/2025 Original
3    06/12/2025               2        NOV SENT OUT        06/12/2025 Original
4    01/07/2025               2        NOV SENT OUT        01/07/2025 Original
5    01/07/2025               2        NOV SENT OUT        01/07/2025 Original
6    11/04/2024              19    VIOLATION CLOSED        11/22/2024 Original
  ViolationStatus RentImpairing Latitude Longitude CommunityBoard
1           Close             N 40.72576 -73.98287              3
2            Open             N 40.80370 -73.95271             10
3            Open             N 40.79517 -73.94330             11
4            Open             N 40.82245 -73.93894             10
5            Open             N 40.82245 -73.93894             10
6           Close             N 40.82737 -73.94479              9
  CouncilDistrict CensusTract     BIN        BBL                         NTA
1               2          32 1005084 1004020015                East Village
2               9         218 1058377 1019220024              Harlem (South)
3               8         172 1052203 1016390029         East Harlem (South)
4               9         232 1060487 1020310038              Harlem (North)
5               9         232 1060487 1020310038              Harlem (North)
6               7         231 1061525 1020630050 Hamilton Heights-Sugar Hill
colnames(housing_filtered)
 [1] "ViolationID"           "BuildingID"            "RegistrationID"       
 [4] "BoroID"                "Borough"               "HouseNumber"          
 [7] "LowHouseNumber"        "HighHouseNumber"       "StreetName"           
[10] "StreetCode"            "Postcode"              "Apartment"            
[13] "Story"                 "Block"                 "Lot"                  
[16] "Class"                 "InspectionDate"        "ApprovedDate"         
[19] "OriginalCertifyByDate" "OriginalCorrectByDate" "NewCertifyByDate"     
[22] "NewCorrectByDate"      "CertifiedDate"         "OrderNumber"          
[25] "NOVID"                 "NOVDescription"        "NOVIssuedDate"        
[28] "CurrentStatusID"       "CurrentStatus"         "CurrentStatusDate"    
[31] "NovType"               "ViolationStatus"       "RentImpairing"        
[34] "Latitude"              "Longitude"             "CommunityBoard"       
[37] "CouncilDistrict"       "CensusTract"           "BIN"                  
[40] "BBL"                   "NTA"                  
head(sr311_filtered)
  Unique.Key           Created.Date            Closed.Date Agency
1   66674892 10/31/2025 11:59:47 PM 11/01/2025 12:05:10 AM   DSNY
2   66675637 10/31/2025 11:59:32 PM 11/01/2025 01:12:16 AM   NYPD
3   66685926 10/31/2025 11:59:10 PM 11/01/2025 04:52:29 AM   NYPD
4   66677254 10/31/2025 11:58:27 PM 11/01/2025 12:35:08 AM   NYPD
5   66675675 10/31/2025 11:58:20 PM 10/31/2025 11:59:23 PM   NYPD
6   66685266 10/31/2025 11:57:28 PM 11/01/2025 12:46:31 AM   NYPD
                      Agency.Name          Complaint.Type       Descriptor
1        Department of Sanitation      Vendor Enforcement      Food Vendor
2 New York City Police Department     Noise - Residential     Loud Talking
3 New York City Police Department     Noise - Residential Loud Music/Party
4 New York City Police Department     Noise - Residential Loud Music/Party
5 New York City Police Department         Noise - Vehicle  Car/Truck Music
6 New York City Police Department Noise - Street/Sidewalk Loud Music/Party
               Location.Type Incident.Zip     Incident.Address      Street.Name
1                     Street        10002     9 DELANCY STREET   DELANCY STREET
2 Residential Building/House        10025 249 WEST  109 STREET WEST  109 STREET
3 Residential Building/House        10004      25 BROAD STREET     BROAD STREET
4 Residential Building/House        10025 550 WEST  114 STREET WEST  114 STREET
5            Street/Sidewalk        10009    87 ST MARKS PLACE   ST MARKS PLACE
6            Street/Sidewalk        10026 140 WEST  113 STREET WEST  113 STREET
      Cross.Street.1                   Cross.Street.2 Intersection.Street.1
1             BOWERY                  CHRYSTIE STREET                BOWERY
2   AMSTERDAM AVENUE                         BROADWAY      AMSTERDAM AVENUE
3     EXCHANGE PLACE                    BEAVER STREET        EXCHANGE PLACE
4   AMSTERDAM AVENUE                         BROADWAY      AMSTERDAM AVENUE
5           1 AVENUE                         AVENUE A              1 AVENUE
6 ST NICHOLAS AVENUE ADAM CLAYTON POWELL JR BOULEVARD    ST NICHOLAS AVENUE
             Intersection.Street.2 Address.Type     City         Landmark
1                  CHRYSTIE STREET      ADDRESS NEW YORK  DELANCEY STREET
2                         BROADWAY      ADDRESS NEW YORK WEST  109 STREET
3                    BEAVER STREET      ADDRESS NEW YORK     BROAD STREET
4                         BROADWAY      ADDRESS NEW YORK WEST  114 STREET
5                         AVENUE A      ADDRESS NEW YORK   ST MARKS PLACE
6 ADAM CLAYTON POWELL JR BOULEVARD      ADDRESS NEW YORK WEST  113 STREET
  Facility.Type Status Due.Date
1               Closed         
2               Closed         
3               Closed         
4               Closed         
5               Closed         
6               Closed         
                                                                                                                   Resolution.Description
1                                                                                                                                     N/A
2 The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.
3                                   The Police Department responded to the complaint and determined that police action was not necessary.
4                                                  The Police Department responded to the complaint and took action to fix the condition.
5                                                  The Police Department responded to the complaint and took action to fix the condition.
6 The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.
  Resolution.Action.Updated.Date Community.Board        BBL   Borough
1         11/01/2025 12:05:13 AM    03 MANHATTAN 1004240026 MANHATTAN
2         11/01/2025 01:12:19 AM    07 MANHATTAN 1018810008 MANHATTAN
3         11/01/2025 04:52:35 AM    01 MANHATTAN 1000257502 MANHATTAN
4         11/01/2025 12:35:12 AM    09 MANHATTAN 1018850156 MANHATTAN
5         10/31/2025 11:59:26 PM    03 MANHATTAN 1004360057 MANHATTAN
6         11/01/2025 12:46:34 AM    10 MANHATTAN 1018220057 MANHATTAN
  X.Coordinate..State.Plane. Y.Coordinate..State.Plane. Open.Data.Channel.Type
1                    986,083                    201,633                  PHONE
2                    993,702                    231,848                  PHONE
3                    981,128                    196,533                 ONLINE
4                    994,254                    233,029                 MOBILE
5                    988,427                    204,307                  PHONE
6                    997,281                    231,054                 ONLINE
  Park.Facility.Name Park.Borough Vehicle.Type Taxi.Company.Borough
1        Unspecified    MANHATTAN                                  
2        Unspecified    MANHATTAN                                  
3        Unspecified    MANHATTAN                                  
4        Unspecified    MANHATTAN                                  
5        Unspecified    MANHATTAN          Car                     
6        Unspecified    MANHATTAN                                  
  Taxi.Pick.Up.Location Bridge.Highway.Name Bridge.Highway.Direction Road.Ramp
1                                                                             
2                                                                             
3                                                                             
4                                                                             
5                                                                             
6                                                                             
  Bridge.Highway.Segment Latitude Longitude
1                        40.72011 -73.99339
2                        40.80304 -73.96586
3                        40.70611 -74.01126
4                        40.80628 -73.96386
5                        40.72745 -73.98493
6                        40.80086 -73.95293
                                  Location
1  (40.72011254060361, -73.99338737853634)
2  (40.80304007929047, -73.96585904688254)
3 (40.706113895406446, -74.01126037586627)
4   (40.80628099138207, -73.9638634441027)
5   (40.72745122642037, -73.9849296444645)
6  (40.80085621357295, -73.95293311739029)
colnames(sr311_filtered)
 [1] "Unique.Key"                     "Created.Date"                  
 [3] "Closed.Date"                    "Agency"                        
 [5] "Agency.Name"                    "Complaint.Type"                
 [7] "Descriptor"                     "Location.Type"                 
 [9] "Incident.Zip"                   "Incident.Address"              
[11] "Street.Name"                    "Cross.Street.1"                
[13] "Cross.Street.2"                 "Intersection.Street.1"         
[15] "Intersection.Street.2"          "Address.Type"                  
[17] "City"                           "Landmark"                      
[19] "Facility.Type"                  "Status"                        
[21] "Due.Date"                       "Resolution.Description"        
[23] "Resolution.Action.Updated.Date" "Community.Board"               
[25] "BBL"                            "Borough"                       
[27] "X.Coordinate..State.Plane."     "Y.Coordinate..State.Plane."    
[29] "Open.Data.Channel.Type"         "Park.Facility.Name"            
[31] "Park.Borough"                   "Vehicle.Type"                  
[33] "Taxi.Company.Borough"           "Taxi.Pick.Up.Location"         
[35] "Bridge.Highway.Name"            "Bridge.Highway.Direction"      
[37] "Road.Ramp"                      "Bridge.Highway.Segment"        
[39] "Latitude"                       "Longitude"                     
[41] "Location"                      
# ==== Libraries ====
library(dplyr)
library(lubridate)

housing <- housing_filtered |>
  mutate(
    # standardize all date columns
    inspectiondate = parse_date_time(InspectionDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    approveddate = parse_date_time(ApprovedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    originalcertifybydate = parse_date_time(OriginalCertifyByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    originalcorrectbydate = parse_date_time(OriginalCorrectByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    newcertifybydate = parse_date_time(NewCertifyByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    newcorrectbydate = parse_date_time(NewCorrectByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    certifieddate = parse_date_time(CertifiedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    novissueddate = parse_date_time(NOVIssuedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    currentstatusdate = parse_date_time(CurrentStatusDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    borough_std = toupper(Borough)
  ) |>
  # subset only Manhattan & date range
  filter(
    borough_std == "MANHATTAN",
    inspectiondate >= as.Date("2024-11-01") & inspectiondate <= as.Date("2025-11-01")
  )

# check
cat("Housing Violations Data Rows after filtering:", nrow(housing), "\n")
Housing Violations Data Rows after filtering: 143112 
library(dplyr)
library(lubridate)

sr311 <- sr311_filtered |>
  mutate(
    # standardize date columns
    created_date = parse_date_time(Created.Date, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    closed_date = parse_date_time(Closed.Date, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    borough_std = toupper(Borough)
  ) |>
  # subset only Manhattan & last 1 year
  filter(
    borough_std == "MANHATTAN",
    created_date >= as.Date("2024-11-01") & created_date <= as.Date("2025-11-01")
  )

# check
cat("311 Service Requests Data Rows after filtering:", nrow(sr311), "\n")
311 Service Requests Data Rows after filtering: 701761 
# housing <- housing_filtered
# sr311 <- sr311_filtered

cat("Housing Violations rows:", nrow(housing), "\n")
Housing Violations rows: 143112 
cat("311 Service Requests rows:", nrow(sr311), "\n")
311 Service Requests rows: 701761 
colnames(housing)
 [1] "ViolationID"           "BuildingID"            "RegistrationID"       
 [4] "BoroID"                "Borough"               "HouseNumber"          
 [7] "LowHouseNumber"        "HighHouseNumber"       "StreetName"           
[10] "StreetCode"            "Postcode"              "Apartment"            
[13] "Story"                 "Block"                 "Lot"                  
[16] "Class"                 "InspectionDate"        "ApprovedDate"         
[19] "OriginalCertifyByDate" "OriginalCorrectByDate" "NewCertifyByDate"     
[22] "NewCorrectByDate"      "CertifiedDate"         "OrderNumber"          
[25] "NOVID"                 "NOVDescription"        "NOVIssuedDate"        
[28] "CurrentStatusID"       "CurrentStatus"         "CurrentStatusDate"    
[31] "NovType"               "ViolationStatus"       "RentImpairing"        
[34] "Latitude"              "Longitude"             "CommunityBoard"       
[37] "CouncilDistrict"       "CensusTract"           "BIN"                  
[40] "BBL"                   "NTA"                   "inspectiondate"       
[43] "approveddate"          "originalcertifybydate" "originalcorrectbydate"
[46] "newcertifybydate"      "newcorrectbydate"      "certifieddate"        
[49] "novissueddate"         "currentstatusdate"     "borough_std"          
colnames(sr311)
 [1] "Unique.Key"                     "Created.Date"                  
 [3] "Closed.Date"                    "Agency"                        
 [5] "Agency.Name"                    "Complaint.Type"                
 [7] "Descriptor"                     "Location.Type"                 
 [9] "Incident.Zip"                   "Incident.Address"              
[11] "Street.Name"                    "Cross.Street.1"                
[13] "Cross.Street.2"                 "Intersection.Street.1"         
[15] "Intersection.Street.2"          "Address.Type"                  
[17] "City"                           "Landmark"                      
[19] "Facility.Type"                  "Status"                        
[21] "Due.Date"                       "Resolution.Description"        
[23] "Resolution.Action.Updated.Date" "Community.Board"               
[25] "BBL"                            "Borough"                       
[27] "X.Coordinate..State.Plane."     "Y.Coordinate..State.Plane."    
[29] "Open.Data.Channel.Type"         "Park.Facility.Name"            
[31] "Park.Borough"                   "Vehicle.Type"                  
[33] "Taxi.Company.Borough"           "Taxi.Pick.Up.Location"         
[35] "Bridge.Highway.Name"            "Bridge.Highway.Direction"      
[37] "Road.Ramp"                      "Bridge.Highway.Segment"        
[39] "Latitude"                       "Longitude"                     
[41] "Location"                       "created_date"                  
[43] "closed_date"                    "borough_std"                   
library(dplyr)
library(ggplot2)
library(ggalluvial)
library(lubridate)
library(tidyr)
library(scales)

# ---------- 1) Ensure Housing Dates are Date class ----------
housing <- housing_filtered |>
  mutate(
    # inspectiondate = as.Date(parse_date_time(InspectionDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
    # approveddate = as.Date(parse_date_time(ApprovedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
    # novissueddate = as.Date(parse_date_time(NOVIssuedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
    # currentstatusdate = as.Date(parse_date_time(CurrentStatusDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")))
    # standardize all date columns
    inspectiondate = parse_date_time(InspectionDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    approveddate = parse_date_time(ApprovedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    originalcertifybydate = parse_date_time(OriginalCertifyByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    originalcorrectbydate = parse_date_time(OriginalCorrectByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    newcertifybydate = parse_date_time(NewCertifyByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    newcorrectbydate = parse_date_time(NewCorrectByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    certifieddate = parse_date_time(CertifiedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    novissueddate = parse_date_time(NOVIssuedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    currentstatusdate = parse_date_time(CurrentStatusDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    # borough_std = toupper(Borough)
    
  )

# ---------- 2) Ensure 311 Dates are Date class ----------
sr311 <- sr311_filtered |>
  mutate(
    created_date = as.Date(parse_date_time(Created.Date, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
    closed_date = as.Date(parse_date_time(Closed.Date, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")))
  )

# ---------- 3) Housing Violations Timeline ----------
housing |>
  filter(!is.na(inspectiondate)) |>
  count(inspectiondate) |>
  ggplot(aes(x = inspectiondate, y = n)) +
  geom_line(color = "red") +
  labs(title = "Housing Violations Timeline (Manhattan, Last Year)", x = "Inspection Date", y = "Count") +
  theme_minimal()

# ---------- 4) 311 Complaints Timeline ----------
sr311 |>
  filter(!is.na(created_date)) |>
  count(created_date) |>
  ggplot(aes(x = created_date, y = n)) +
  geom_line(color = "orange") +
  labs(title = "311 Complaints Timeline (Manhattan, Last Year)", x = "Created Date", y = "Count") +
  theme_minimal()

# ---------- 5) Monthly aggregated Housing vs 311 ----------
housing_monthly <- housing |>
  filter(!is.na(inspectiondate)) |>
  mutate(month = floor_date(inspectiondate, "month")) |>
  group_by(month) |>
  summarise(HousingCount = n(), .groups = "drop")

sr311_monthly <- sr311 |>
  filter(!is.na(created_date)) |>
  mutate(month = floor_date(created_date, "month")) |>
  group_by(month) |>
  summarise(ComplaintCount = n(), .groups = "drop")

monthly_combined <- housing_monthly |>
  left_join(sr311_monthly, by = "month") |>
  pivot_longer(cols = c(HousingCount, ComplaintCount), names_to = "Source", values_to = "Count")

monthly_combined |>
  ggplot(aes(axis1 = month, axis2 = Source, y = Count)) +
  geom_alluvium(aes(fill = Source), width = 0.25) +
  geom_stratum(width = 0.25) +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
  theme_minimal() +
  labs(title = "Monthly Counts: Housing Violations vs 311 Complaints", y = "Count")

library(dplyr)
library(ggplot2)
library(ggalluvial)
library(lubridate)
library(scales)
library(tidyr)


# ---------- 3) Quick row counts ----------
cat("Housing Violations (Manhattan, 1yr):", nrow(housing_filtered), "rows\n")
Housing Violations (Manhattan, 1yr): 143112 rows
cat("311 Service Requests (Manhattan, 1yr):", nrow(sr311_filtered), "rows\n")
311 Service Requests (Manhattan, 1yr): 701761 rows
cat("Housing Violations rows:", nrow(housing), "\n")
Housing Violations rows: 143112 
cat("311 Service Requests rows:", nrow(sr311), "\n")
311 Service Requests rows: 701761 
# ---------- 4) Alluvial plot: Housing Class -> Current Status ----------
housing_filtered |>
  filter(!is.na(Class), !is.na(CurrentStatusID)) |>
  count(Class, CurrentStatusID) |>
  ggplot(aes(axis1 = Class, axis2 = CurrentStatusID, y = n)) +
  geom_alluvium(aes(fill = Class), width = 0.25) +
  geom_stratum(width = 0.25) +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
  theme_minimal() +
  labs(title = "Housing Violations: Class → Current Status", y = "Count")

# ---------- 5) 311 Complaint Type Counts ----------
sr311_filtered |>
  filter(!is.na(Complaint.Type)) |>
  count(Complaint.Type) |>
  slice_max(n, n = 15) |>
  ggplot(aes(x = reorder(Complaint.Type, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 15 311 Complaint Types (Manhattan, Last Year)", x = "", y = "Count") +
  theme_minimal()

# ---------- 6) Housing Violations by Street ----------
housing_filtered |>
  filter(!is.na(StreetName)) |>
  count(StreetName) |>
  slice_max(n, n = 20) |>
  ggplot(aes(x = reorder(StreetName, n), y = n)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(title = "Top 20 Manhattan Streets by Housing Violations (Last Year)", x = "", y = "Count") +
  theme_minimal()

# ---------- 7) 311 Complaints by Zip ----------
sr311_filtered |>
  filter(!is.na(Incident.Zip)) |>
  count(Incident.Zip) |>
  slice_max(n, n = 20) |>
  ggplot(aes(x = reorder(Incident.Zip, n), y = n)) +
  geom_col(fill = "purple") +
  coord_flip() +
  labs(title = "Top 20 ZIP Codes by 311 Complaints (Manhattan, Last Year)", x = "", y = "Count") +
  theme_minimal()

library(dplyr)
library(ggplot2)
library(ggalluvial)
library(lubridate)
library(tidyr)
library(scales)

# ---------- 1) Standardize Housing Dates ----------
housing <- housing_filtered |>
  mutate(
    # inspectiondate = as.Date(parse_date_time(InspectionDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
    # approveddate = as.Date(parse_date_time(ApprovedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
    # novissueddate = as.Date(parse_date_time(NOVIssuedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
    # currentstatusdate = as.Date(parse_date_time(CurrentStatusDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
    # borough_std = toupper(Borough)
    inspectiondate = parse_date_time(InspectionDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    approveddate = parse_date_time(ApprovedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    originalcertifybydate = parse_date_time(OriginalCertifyByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    originalcorrectbydate = parse_date_time(OriginalCorrectByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    newcertifybydate = parse_date_time(NewCertifyByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    newcorrectbydate = parse_date_time(NewCorrectByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    certifieddate = parse_date_time(CertifiedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    novissueddate = parse_date_time(NOVIssuedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    currentstatusdate = parse_date_time(CurrentStatusDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
    borough_std = toupper(Borough)
  ) |>
  filter(borough_std == "MANHATTAN", inspectiondate >= as.Date("2024-11-01") & inspectiondate <= as.Date("2025-11-01"))

# ---------- 2) Standardize 311 Dates ----------
sr311 <- sr311_filtered |>
  mutate(
    created_date = as.Date(parse_date_time(Created.Date, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
    closed_date = as.Date(parse_date_time(Closed.Date, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
    borough_std = toupper(Borough)
  ) |>
  filter(borough_std == "MANHATTAN", created_date >= as.Date("2024-11-01") & created_date <= as.Date("2025-11-01"))

# ---------- 3) Quick Row Counts ----------
cat("Housing Violations Rows:", nrow(housing), "\n")
Housing Violations Rows: 143112 
cat("311 Service Requests Rows:", nrow(sr311), "\n")
311 Service Requests Rows: 701761 
# ---------- 4) Alluvial: Housing Class -> Current Status ----------
housing |>
  filter(!is.na(Class), !is.na(CurrentStatusID)) |>
  count(Class, CurrentStatusID) |>
  ggplot(aes(axis1 = Class, axis2 = CurrentStatusID, y = n)) +
  geom_alluvium(aes(fill = Class), width = 0.25) +
  geom_stratum(width = 0.25) +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
  theme_minimal() +
  labs(title = "Housing Violations: Class → Current Status", y = "Count")

# ---------- 5) Top 311 Complaint Types ----------
sr311 |>
  filter(!is.na(Complaint.Type)) |>
  count(Complaint.Type) |>
  slice_max(n, n = 15) |>
  ggplot(aes(x = reorder(Complaint.Type, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 15 311 Complaint Types (Manhattan, Last Year)", x = "", y = "Count") +
  theme_minimal()

# ---------- 6) Housing Violations by Street ----------
housing |>
  filter(!is.na(StreetName)) |>
  count(StreetName) |>
  slice_max(n, n = 20) |>
  ggplot(aes(x = reorder(StreetName, n), y = n)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(title = "Top 20 Manhattan Streets by Housing Violations", x = "", y = "Count") +
  theme_minimal()

# ---------- 7) 311 Complaints by ZIP ----------
sr311 |>
  filter(!is.na(Incident.Zip)) |>
  count(Incident.Zip) |>
  slice_max(n, n = 20) |>
  ggplot(aes(x = reorder(Incident.Zip, n), y = n)) +
  geom_col(fill = "purple") +
  coord_flip() +
  labs(title = "Top 20 ZIP Codes by 311 Complaints", x = "", y = "Count") +
  theme_minimal()

# ---------- 8) Housing Violations Timeline ----------
housing |>
  filter(!is.na(inspectiondate)) |>
  count(inspectiondate) |>
  ggplot(aes(x = inspectiondate, y = n)) +
  geom_line(color = "red") +
  labs(title = "Housing Violations Timeline", x = "Inspection Date", y = "Count") +
  theme_minimal()

# ---------- 9) 311 Complaints Timeline ----------
sr311 |>
  filter(!is.na(created_date)) |>
  count(created_date) |>
  ggplot(aes(x = created_date, y = n)) +
  geom_line(color = "orange") +
  labs(title = "311 Complaints Timeline", x = "Created Date", y = "Count") +
  theme_minimal()

# ---------- 10) Monthly Aggregated Housing vs 311 ----------
housing_monthly <- housing |>
  filter(!is.na(inspectiondate)) |>
  mutate(month = floor_date(inspectiondate, "month")) |>
  group_by(month) |>
  summarise(HousingCount = n(), .groups = "drop")

sr311_monthly <- sr311 |>
  filter(!is.na(created_date)) |>
  mutate(month = floor_date(created_date, "month")) |>
  group_by(month) |>
  summarise(ComplaintCount = n(), .groups = "drop")

monthly_combined <- housing_monthly |>
  left_join(sr311_monthly, by = "month") |>
  pivot_longer(cols = c(HousingCount, ComplaintCount), names_to = "Source", values_to = "Count")

monthly_combined |>
  ggplot(aes(axis1 = month, axis2 = Source, y = Count)) +
  geom_alluvium(aes(fill = Source), width = 0.25) +
  geom_stratum(width = 0.25) +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
  theme_minimal() +
  labs(title = "Monthly Counts: Housing Violations vs 311 Complaints", y = "Count")

# ---------- 11) Alluvial: Housing Class -> NOV Type ----------
housing |>
  filter(!is.na(Class), !is.na(NovType)) |>
  count(Class, NovType) |>
  ggplot(aes(axis1 = Class, axis2 = NovType, y = n)) +
  geom_alluvium(aes(fill = Class), width = 0.25) +
  geom_stratum(width = 0.25) +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
  theme_minimal() +
  labs(title = "Housing Violations: Class → NOV Type", y = "Count")

# ---------- 12) Top Housing Violation Classes ----------
housing |>
  filter(!is.na(Class)) |>
  count(Class) |>
  slice_max(n, n = 15) |>
  ggplot(aes(x = reorder(Class, n), y = n)) +
  geom_col(fill = "darkred") +
  coord_flip() +
  labs(title = "Top 15 Housing Violation Classes (Manhattan, Last Year)", x = "", y = "Count") +
  theme_minimal()

# library(dplyr)
# library(ggplot2)
# library(ggalluvial)

# # Only keep rows where at least inspectiondate exists
# viol_comparison <- housing |>
#   select(ViolationID, inspectiondate, approveddate, certifieddate, currentstatusdate) |>
#   filter(!is.na(inspectiondate)) |>
#   pivot_longer(
#     cols = c(inspectiondate, approveddate, certifieddate, currentstatusdate),
#     names_to = "Stage",
#     values_to = "Date"
#   ) |>
#   filter(!is.na(Date)) |>
#   mutate(Stage = factor(Stage,
#                         levels = c("inspectiondate", "approveddate", "certifieddate", "currentstatusdate"),
#                         labels = c("Inspection", "Approved", "Certified", "Final Status")
#   ))
# 
# # Check the first few rows
# head(viol_comparison)

# # Alluvial plot
# ggplot(viol_comparison,
#        aes(axis1 = Stage, axis2 = Date, y = 1)) +
#   geom_alluvium(aes(fill = Stage), width = 0.2, alpha = 0.7) +
#   geom_stratum(width = 0.2) +
#   geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5) +
#   theme_minimal() +
#   labs(
#     title = "Housing Violations: Flow Across Key Dates (Manhattan, Last Year)",
#     x = "Stage",
#     y = "Count"
#   )

Code with last 3 years Data:

library(data.table)

# ========== LOAD FINAL DATASETS ==========
cat("========== LOADING FINAL DATASETS ==========\n\n")
========== LOADING FINAL DATASETS ==========
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
sr311_raw  <- fread("datasets/311_Housing_Complaints_2022_onwards.csv")

# ========== HOUSING VIOLATIONS ==========
cat("===== DATASET 1: HOUSING VIOLATIONS (2022+) =====\n\n")
===== DATASET 1: HOUSING VIOLATIONS (2022+) =====
cat("Dimensions:\n")
Dimensions:
cat("Rows:", nrow(housing_raw), "\n")
Rows: 589005 
cat("Columns:", ncol(housing_raw), "\n\n")
Columns: 41 
cat("Column names:\n")
Column names:
print(names(housing_raw))
 [1] "ViolationID"           "BuildingID"            "RegistrationID"       
 [4] "BoroID"                "Borough"               "HouseNumber"          
 [7] "LowHouseNumber"        "HighHouseNumber"       "StreetName"           
[10] "StreetCode"            "Postcode"              "Apartment"            
[13] "Story"                 "Block"                 "Lot"                  
[16] "Class"                 "InspectionDate"        "ApprovedDate"         
[19] "OriginalCertifyByDate" "OriginalCorrectByDate" "NewCertifyByDate"     
[22] "NewCorrectByDate"      "CertifiedDate"         "OrderNumber"          
[25] "NOVID"                 "NOVDescription"        "NOVIssuedDate"        
[28] "CurrentStatusID"       "CurrentStatus"         "CurrentStatusDate"    
[31] "NovType"               "ViolationStatus"       "RentImpairing"        
[34] "Latitude"              "Longitude"             "CommunityBoard"       
[37] "CouncilDistrict"       "CensusTract"           "BIN"                  
[40] "BBL"                   "NTA"                  
cat("\n")
cat("Data types:\n")
Data types:
print(sapply(housing_raw, class))
$ViolationID
[1] "integer"

$BuildingID
[1] "integer"

$RegistrationID
[1] "integer"

$BoroID
[1] "integer"

$Borough
[1] "character"

$HouseNumber
[1] "character"

$LowHouseNumber
[1] "character"

$HighHouseNumber
[1] "character"

$StreetName
[1] "character"

$StreetCode
[1] "integer"

$Postcode
[1] "integer"

$Apartment
[1] "character"

$Story
[1] "integer"

$Block
[1] "integer"

$Lot
[1] "character"

$Class
[1] "character"

$InspectionDate
[1] "IDate" "Date" 

$ApprovedDate
[1] "character"

$OriginalCertifyByDate
[1] "character"

$OriginalCorrectByDate
[1] "character"

$NewCertifyByDate
[1] "character"

$NewCorrectByDate
[1] "character"

$CertifiedDate
[1] "character"

$OrderNumber
[1] "integer"

$NOVID
[1] "integer"

$NOVDescription
[1] "character"

$NOVIssuedDate
[1] "character"

$CurrentStatusID
[1] "integer"

$CurrentStatus
[1] "character"

$CurrentStatusDate
[1] "character"

$NovType
[1] "character"

$ViolationStatus
[1] "character"

$RentImpairing
[1] "character"

$Latitude
[1] "numeric"

$Longitude
[1] "numeric"

$CommunityBoard
[1] "integer"

$CouncilDistrict
[1] "integer"

$CensusTract
[1] "integer"

$BIN
[1] "integer"

$BBL
[1] "integer"

$NTA
[1] "character"
cat("\n")
cat("First 5 records:\n")
First 5 records:
print(head(housing_raw, 5))
   ViolationID BuildingID RegistrationID BoroID   Borough HouseNumber
         <int>      <int>          <int>  <int>    <char>      <char>
1:    15004789     805012         100420      1 MANHATTAN     144REAR
2:    14817075       9458         104092      1 MANHATTAN           1
3:    14831546      40441         118126      1 MANHATTAN         158
4:    15313781      18016         120064      1 MANHATTAN         314
5:    14838558       8157         118733      1 MANHATTAN        3885
   LowHouseNumber HighHouseNumber         StreetName StreetCode Postcode
           <char>          <char>             <char>      <int>    <int>
1:        144rear        146 REAR      LUDLOW STREET      25090    10002
2:              1               7 CHRISTOPHER STREET      15210    10014
3:            158             158    WEST 132 STREET      36410    10027
4:            314             314     EAST 89 STREET      18770    10128
5:           3885            3897           BROADWAY      13610    10032
   Apartment Story Block    Lot  Class InspectionDate ApprovedDate
      <char> <int> <int> <char> <char>         <IDat>       <char>
1:              NA   411     45      A     2022-03-08   03/08/2022
2:        9C     9   610     60      A     2022-02-10   02/10/2022
3:        3N     3  1916     55      B     2022-02-11   02/11/2022
4:               0  1551     43      C     2022-08-16   08/17/2022
5:       20B     2  2137     80      B     2022-02-14   02/15/2022
   OriginalCertifyByDate OriginalCorrectByDate NewCertifyByDate
                  <char>                <char>           <char>
1:            06/25/2022            06/11/2022                 
2:            05/31/2022            05/17/2022                 
3:            04/04/2022            03/21/2022                 
4:            09/06/2022            09/01/2022                 
5:            04/06/2022            03/23/2022                 
   NewCorrectByDate CertifiedDate OrderNumber   NOVID
             <char>        <char>       <int>   <int>
1:                                       1507 7793231
2:                                        554 7636476
3:                                        579 7638268
4:                     12/05/2022         530 8007840
5:                     03/18/2022         579 7651705
                                                                                                                                                                                          NOVDescription
                                                                                                                                                                                                  <char>
1: (A) § HMC:FILE ANNUAL BEDBUG REPORT IN ACCORDANCE WITH HPD RULE AS DESCRIBED ON THE BACK OF THIS NOTICE OF VIOLATION OR AS DESCRIBED ON HPDâ\u0080\u0099S WEBSITE, WWW.NYC.GOV\\HPD, SEARCH BED BUGS.
2:                           § 27-2005 ADM CODE PAINT METAL IN ACCORDANCE WITH DEPT. REGULATION RADIATOR AT THE SOUTH WALL IN THE KITCHEN LOCATED AT APT 9C, 9th STORY, 3rd APARTMENT FROM EAST AT SOUTH
3:                                                § 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT SINK IN THE KITCHENETTE LOCATED AT B-ROOM 3N, 3rd STORY, 1st B-ROOM FROM NORTH AT EAST
4:                                                                                  § 27-2005, 27-2007, 27-2041.1 HMC: REPLACE OR REPAIR THE SELF-CLOSING DOORS THAT IS MISSING OR DEFECTIVE AT BULKHEAD
5:                                             § 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT WASHBASIN IN THE BATHROOM LOCATED AT APT 20B, 2nd STORY, 1st APARTMENT FROM NORTH AT EAST
   NOVIssuedDate CurrentStatusID       CurrentStatus CurrentStatusDate  NovType
          <char>           <int>              <char>            <char>   <char>
1:    03/08/2022               9 VIOLATION DISMISSED        03/23/2022 Original
2:    02/11/2022              19    VIOLATION CLOSED        03/06/2024 Original
3:    02/14/2022               2        NOV SENT OUT        02/14/2022 Original
4:    08/18/2022              19    VIOLATION CLOSED        06/29/2023 Original
5:    02/16/2022               9 VIOLATION DISMISSED        05/31/2022 Original
   ViolationStatus RentImpairing Latitude Longitude CommunityBoard
            <char>        <char>    <num>     <num>          <int>
1:           Close             N       NA        NA             NA
2:           Close             N 40.73407 -73.99976              2
3:            Open             N 40.81285 -73.94384             10
4:           Close             N 40.77955 -73.94975              8
5:           Close             N 40.83741 -73.94244             12
   CouncilDistrict CensusTract     BIN        BBL                        NTA
             <int>       <int>   <int>      <int>                     <char>
1:              NA          NA      NA         NA                           
2:               3          71 1010692 1006100060               West Village
3:               9         226 1058117 1019160055             Harlem (North)
4:               5       14602 1050093 1015510043  Upper East Side-Yorkville
5:               7         245 1063363 1021370080 Washington Heights (South)
cat("\n")
cat("Summary statistics:\n")
Summary statistics:
print(summary(housing_raw[, .(ViolationID, BuildingID, InspectionDate, 
                                 CurrentStatus, ViolationStatus, RentImpairing,
                                 Latitude, Longitude)]))
  ViolationID         BuildingID      InspectionDate       CurrentStatus     
 Min.   :14753195   Min.   :      1   Min.   :2022-01-01   Length:589005     
 1st Qu.:15642804   1st Qu.:  13158   1st Qu.:2022-12-28   Class :character  
 Median :16530701   Median :  27948   Median :2023-12-13   Mode  :character  
 Mean   :16557793   Mean   : 101240   Mean   :2023-12-13                     
 3rd Qu.:17447307   3rd Qu.:  41552   3rd Qu.:2024-11-22                     
 Max.   :18445551   Max.   :1017519   Max.   :2025-11-23                     
                                                                             
 ViolationStatus    RentImpairing         Latitude       Longitude     
 Length:589005      Length:589005      Min.   :40.70   Min.   :-74.02  
 Class :character   Class :character   1st Qu.:40.78   1st Qu.:-73.97  
 Mode  :character   Mode  :character   Median :40.81   Median :-73.95  
                                       Mean   :40.80   Mean   :-73.95  
                                       3rd Qu.:40.83   3rd Qu.:-73.94  
                                       Max.   :40.88   Max.   :-73.91  
                                       NA's   :77      NA's   :77      
cat("\n")
# ========== 311 HOUSING COMPLAINTS ==========
cat("\n===== DATASET 2: 311 HOUSING COMPLAINTS (2022+) =====\n\n")

===== DATASET 2: 311 HOUSING COMPLAINTS (2022+) =====
cat("Dimensions:\n")
Dimensions:
cat("Rows:", nrow(sr311_raw ), "\n")
Rows: 425248 
cat("Columns:", ncol(sr311_raw ), "\n\n")
Columns: 42 
cat("Column names:\n")
Column names:
print(names(sr311_raw ))
 [1] "Unique Key"                     "Created Date"                  
 [3] "Closed Date"                    "Agency"                        
 [5] "Agency Name"                    "Complaint Type"                
 [7] "Descriptor"                     "Location Type"                 
 [9] "Incident Zip"                   "Incident Address"              
[11] "Street Name"                    "Cross Street 1"                
[13] "Cross Street 2"                 "Intersection Street 1"         
[15] "Intersection Street 2"          "Address Type"                  
[17] "City"                           "Landmark"                      
[19] "Facility Type"                  "Status"                        
[21] "Due Date"                       "Resolution Description"        
[23] "Resolution Action Updated Date" "Community Board"               
[25] "BBL"                            "Borough"                       
[27] "X Coordinate (State Plane)"     "Y Coordinate (State Plane)"    
[29] "Open Data Channel Type"         "Park Facility Name"            
[31] "Park Borough"                   "Vehicle Type"                  
[33] "Taxi Company Borough"           "Taxi Pick Up Location"         
[35] "Bridge Highway Name"            "Bridge Highway Direction"      
[37] "Road Ramp"                      "Bridge Highway Segment"        
[39] "Latitude"                       "Longitude"                     
[41] "Location"                       "CreatedDate"                   
cat("\n")
cat("Data types:\n")
Data types:
print(sapply(sr311_raw , class))
$`Unique Key`
[1] "integer"

$`Created Date`
[1] "character"

$`Closed Date`
[1] "character"

$Agency
[1] "character"

$`Agency Name`
[1] "character"

$`Complaint Type`
[1] "character"

$Descriptor
[1] "character"

$`Location Type`
[1] "character"

$`Incident Zip`
[1] "integer"

$`Incident Address`
[1] "character"

$`Street Name`
[1] "character"

$`Cross Street 1`
[1] "character"

$`Cross Street 2`
[1] "character"

$`Intersection Street 1`
[1] "character"

$`Intersection Street 2`
[1] "character"

$`Address Type`
[1] "character"

$City
[1] "character"

$Landmark
[1] "character"

$`Facility Type`
[1] "character"

$Status
[1] "character"

$`Due Date`
[1] "logical"

$`Resolution Description`
[1] "character"

$`Resolution Action Updated Date`
[1] "character"

$`Community Board`
[1] "character"

$BBL
[1] "integer"

$Borough
[1] "character"

$`X Coordinate (State Plane)`
[1] "character"

$`Y Coordinate (State Plane)`
[1] "character"

$`Open Data Channel Type`
[1] "character"

$`Park Facility Name`
[1] "character"

$`Park Borough`
[1] "character"

$`Vehicle Type`
[1] "logical"

$`Taxi Company Borough`
[1] "logical"

$`Taxi Pick Up Location`
[1] "logical"

$`Bridge Highway Name`
[1] "logical"

$`Bridge Highway Direction`
[1] "logical"

$`Road Ramp`
[1] "logical"

$`Bridge Highway Segment`
[1] "logical"

$Latitude
[1] "numeric"

$Longitude
[1] "numeric"

$Location
[1] "character"

$CreatedDate
[1] "IDate" "Date" 
cat("\n")
cat("First 5 records:\n")
First 5 records:
print(head(sr311_raw , 5))
   Unique Key           Created Date Closed Date Agency
        <int>                 <char>      <char> <char>
1:   66934900 11/23/2025 11:58:19 PM                HPD
2:   66932786 11/23/2025 11:55:32 PM                HPD
3:   66931601 11/23/2025 11:52:04 PM                HPD
4:   66929338 11/23/2025 11:51:49 PM                HPD
5:   66933739 11/23/2025 11:48:51 PM                HPD
                                          Agency Name Complaint Type
                                               <char>         <char>
1: Department of Housing Preservation and Development HEAT/HOT WATER
2: Department of Housing Preservation and Development       PLUMBING
3: Department of Housing Preservation and Development HEAT/HOT WATER
4: Department of Housing Preservation and Development HEAT/HOT WATER
5: Department of Housing Preservation and Development HEAT/HOT WATER
        Descriptor        Location Type Incident Zip     Incident Address
            <char>               <char>        <int>               <char>
1: ENTIRE BUILDING RESIDENTIAL BUILDING        10028 425 EAST   84 STREET
2:  BATHTUB/SHOWER RESIDENTIAL BUILDING        10040 121 FT GEORGE AVENUE
3: ENTIRE BUILDING RESIDENTIAL BUILDING        10040 121 FT GEORGE AVENUE
4: ENTIRE BUILDING RESIDENTIAL BUILDING        10028 425 EAST   84 STREET
5: ENTIRE BUILDING RESIDENTIAL BUILDING        10025 242 WEST  109 STREET
        Street Name Cross Street 1 Cross Street 2 Intersection Street 1
             <char>         <char>         <char>                <char>
1: EAST   84 STREET                                                    
2: FT GEORGE AVENUE                                                    
3: FT GEORGE AVENUE                                                    
4: EAST   84 STREET                                                    
5: WEST  109 STREET                                                    
   Intersection Street 2 Address Type     City Landmark Facility Type Status
                  <char>       <char>   <char>   <char>        <char> <char>
1:                            ADDRESS NEW YORK                          Open
2:                            ADDRESS NEW YORK                          Open
3:                            ADDRESS NEW YORK                          Open
4:                            ADDRESS NEW YORK                          Open
5:                            ADDRESS NEW YORK                          Open
   Due Date
     <lgcl>
1:       NA
2:       NA
3:       NA
4:       NA
5:       NA
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Resolution Description
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <char>
1: This complaint is a duplicate of a building-wide condition already reported by another tenant.  The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment.  If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
2:                                                                                                                                                         The following complaint conditions are still open. HPD has already attempted to notify the property owner that the condition exists; the tenant should provide access for the owner to make the repair.  HPD may attempt to contact the tenant by phone to verify the correction of the condition or an HPD Inspector may attempt to conduct an inspection.
3: This complaint is a duplicate of a building-wide condition already reported by another tenant.  The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment.  If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
4:                                                                                                                                                         The following complaint conditions are still open. HPD has already attempted to notify the property owner that the condition exists; the tenant should provide access for the owner to make the repair.  HPD may attempt to contact the tenant by phone to verify the correction of the condition or an HPD Inspector may attempt to conduct an inspection.
5: This complaint is a duplicate of a building-wide condition already reported by another tenant.  The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment.  If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
   Resolution Action Updated Date Community Board        BBL   Borough
                           <char>          <char>      <int>    <char>
1:         11/23/2025 12:00:00 AM    08 MANHATTAN 1015640012 MANHATTAN
2:         11/23/2025 12:00:00 AM    12 MANHATTAN 1021490265 MANHATTAN
3:         11/23/2025 12:00:00 AM    12 MANHATTAN 1021490265 MANHATTAN
4:         11/23/2025 12:00:00 AM    08 MANHATTAN 1015640012 MANHATTAN
5:         11/23/2025 12:00:00 AM    07 MANHATTAN 1018800053 MANHATTAN
   X Coordinate (State Plane) Y Coordinate (State Plane) Open Data Channel Type
                       <char>                     <char>                 <char>
1:                    998,207                    221,730                 ONLINE
2:                  1,004,446                    251,683                  PHONE
3:                  1,004,446                    251,683                  PHONE
4:                    998,207                    221,730                 ONLINE
5:                    993,732                    231,824                  PHONE
   Park Facility Name Park Borough Vehicle Type Taxi Company Borough
               <char>       <char>       <lgcl>               <lgcl>
1:        Unspecified    MANHATTAN           NA                   NA
2:        Unspecified    MANHATTAN           NA                   NA
3:        Unspecified    MANHATTAN           NA                   NA
4:        Unspecified    MANHATTAN           NA                   NA
5:        Unspecified    MANHATTAN           NA                   NA
   Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp
                  <lgcl>              <lgcl>                   <lgcl>    <lgcl>
1:                    NA                  NA                       NA        NA
2:                    NA                  NA                       NA        NA
3:                    NA                  NA                       NA        NA
4:                    NA                  NA                       NA        NA
5:                    NA                  NA                       NA        NA
   Bridge Highway Segment Latitude Longitude
                   <lgcl>    <num>     <num>
1:                     NA 40.77526 -73.94961
2:                     NA 40.85746 -73.92699
3:                     NA 40.85746 -73.92699
4:                     NA 40.77526 -73.94961
5:                     NA 40.80297 -73.96575
                                  Location CreatedDate
                                    <char>      <IDat>
1: (40.77526292188198, -73.94960785720805)  2025-11-23
2: (40.85746342279737, -73.92699164532567)  2025-11-23
3: (40.85746342279737, -73.92699164532567)  2025-11-23
4: (40.77526292188198, -73.94960785720805)  2025-11-23
5: (40.80297417373674, -73.96575071974307)  2025-11-23
cat("\n")
cat("Summary statistics:\n")
Summary statistics:
print(summary(sr311_raw [, .(`Unique Key`, `Created Date`, `Closed Date`,
                          Agency, `Complaint Type`, Status,
                          Latitude, Longitude)]))
   Unique Key       Created Date       Closed Date           Agency         
 Min.   :52939155   Length:425248      Length:425248      Length:425248     
 1st Qu.:56470870   Class :character   Class :character   Class :character  
 Median :59938206   Mode  :character   Mode  :character   Mode  :character  
 Mean   :59892593                                                           
 3rd Qu.:63464249                                                           
 Max.   :66935006                                                           
 Complaint Type        Status             Latitude       Longitude     
 Length:425248      Length:425248      Min.   :40.69   Min.   :-74.02  
 Class :character   Class :character   1st Qu.:40.77   1st Qu.:-73.97  
 Mode  :character   Mode  :character   Median :40.81   Median :-73.95  
                                       Mean   :40.80   Mean   :-73.96  
                                       3rd Qu.:40.83   3rd Qu.:-73.94  
                                       Max.   :40.88   Max.   :-73.91  
cat("\n")
# ========== KEY DISTRIBUTIONS ==========
cat("\n===== KEY DISTRIBUTIONS =====\n\n")

===== KEY DISTRIBUTIONS =====
cat("Housing Violations - Top 10 Current Status:\n")
Housing Violations - Top 10 Current Status:
print(housing_raw[, .N, by=CurrentStatus][order(-N)][1:10])
                               CurrentStatus      N
                                      <char>  <int>
 1:                      VIOLATION DISMISSED 197906
 2:                         VIOLATION CLOSED 180829
 3:                             NOV SENT OUT 112090
 4: FIRST NO ACCESS TO RE- INSPECT VIOLATION  18629
 5:                        NOT COMPLIED WITH  17976
 6:                        INFO NOV SENT OUT  12927
 7:        NOTICE OF ISSUANCE SENT TO TENANT  11160
 8:            VIOLATION WILL BE REINSPECTED  10263
 9:                             CIV14 MAILED   7244
10:                     DEFECT LETTER ISSUED   5934
cat("\n")
cat("311 Complaints - Top 10 Complaint Types:\n")
311 Complaints - Top 10 Complaint Types:
print(sr311_raw [, .N, by=`Complaint Type`][order(-N)][1:10])
                   Complaint Type      N
                           <char>  <int>
 1:                HEAT/HOT WATER 229222
 2:                      PLUMBING  50637
 3:                 PAINT/PLASTER  44624
 4:                    WATER LEAK  30190
 5: General Construction/Plumbing  25207
 6:       Maintenance or Facility  20202
 7:                      Elevator  16588
 8:                      Plumbing   2197
 9:                      ELEVATOR   1302
10:          Non-Residential Heat   1094
cat("\n")
cat("311 Complaints - Status distribution:\n")
311 Complaints - Status distribution:
print(sr311_raw [, .N, by=Status][order(-N)])
        Status      N
        <char>  <int>
1:      Closed 419047
2:        Open   5152
3: In Progress    922
4:    Assigned    124
5: Unspecified      3
cat("\n")
cat("311 Complaints - Top Agencies:\n")
311 Complaints - Top Agencies:
print(sr311_raw [, .N, by=Agency][order(-N)][1:10])
    Agency      N
    <char>  <int>
 1:    HPD 358603
 2:    DOB  43687
 3:    DPR  20202
 4:  DOHMH   1579
 5:    DOE   1064
 6:    DEP     99
 7:    DOT     14
 8:   <NA>     NA
 9:   <NA>     NA
10:   <NA>     NA
library(data.table)

# Just load the filtered datasets we already have
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
sr311_raw  <- fread("datasets/311_Housing_Complaints_2022_onwards.csv")

cat("Working with FULL datasets:\n")
Working with FULL datasets:
cat("Housing Violations:", nrow(housing_raw), "rows x", ncol(housing_raw), "columns\n")
Housing Violations: 589005 rows x 41 columns
cat("311 Complaints:", nrow(sr311_raw ), "rows x", ncol(sr311_raw ), "columns\n")
311 Complaints: 425248 rows x 42 columns
library(data.table)
library(ggplot2)

set.seed(2025)

# Load datasets
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
sr311_raw  <- fread("datasets/311_Housing_Complaints_2022_onwards.csv")

# ========== MISSING VALUE ANALYSIS ==========

# === HOUSING VIOLATIONS ===
cat("===== HOUSING VIOLATIONS - MISSING VALUES =====\n\n")
===== HOUSING VIOLATIONS - MISSING VALUES =====
# Calculate missing counts and percentages
missing_viol <- data.table(
  Column = names(housing_raw),
  Missing_Count = sapply(housing_raw, function(x) sum(is.na(x) | x == "")),
  Total_Rows = nrow(housing_raw)
)
missing_viol[, Missing_Pct := round(Missing_Count / Total_Rows * 100, 2)]
missing_viol <- missing_viol[order(-Missing_Pct)]

cat("Columns with missing values:\n")
Columns with missing values:
print(missing_viol[Missing_Pct > 0])
                   Column Missing_Count Total_Rows Missing_Pct
                   <char>         <int>      <int>       <num>
 1:      NewCertifyByDate        585410     589005       99.39
 2:      NewCorrectByDate        585410     589005       99.39
 3:         CertifiedDate        360194     589005       61.15
 4:             Apartment        198543     589005       33.71
 5:                 Story         77094     589005       13.09
 6: OriginalCertifyByDate         34593     589005        5.87
 7: OriginalCorrectByDate         34593     589005        5.87
 8:                 NOVID         34593     589005        5.87
 9:         NOVIssuedDate         34593     589005        5.87
10:               NovType         34593     589005        5.87
11:                   BIN           305     589005        0.05
12:                   BBL           305     589005        0.05
13:              Postcode           147     589005        0.02
14:              Latitude            77     589005        0.01
15:             Longitude            77     589005        0.01
16:        CommunityBoard            77     589005        0.01
17:       CouncilDistrict            77     589005        0.01
18:           CensusTract            77     589005        0.01
19:                   NTA            77     589005        0.01
cat("\nColumns with >50% missing:\n")

Columns with >50% missing:
print(missing_viol[Missing_Pct > 50])
             Column Missing_Count Total_Rows Missing_Pct
             <char>         <int>      <int>       <num>
1: NewCertifyByDate        585410     589005       99.39
2: NewCorrectByDate        585410     589005       99.39
3:    CertifiedDate        360194     589005       61.15
# === 311 COMPLAINTS ===
cat("\n\n===== 311 HOUSING COMPLAINTS - MISSING VALUES =====\n\n")


===== 311 HOUSING COMPLAINTS - MISSING VALUES =====
# Calculate missing counts and percentages
missing_311 <- data.table(
  Column = names(sr311_raw ),
  Missing_Count = sapply(sr311_raw , function(x) sum(is.na(x) | x == "")),
  Total_Rows = nrow(sr311_raw )
)
missing_311[, Missing_Pct := round(Missing_Count / Total_Rows * 100, 2)]
missing_311 <- missing_311[order(-Missing_Pct)]

cat("Columns with missing values:\n")
Columns with missing values:
print(missing_311[Missing_Pct > 0])
                            Column Missing_Count Total_Rows Missing_Pct
                            <char>         <int>      <int>       <num>
 1:                       Due Date        425248     425248      100.00
 2:                   Vehicle Type        425248     425248      100.00
 3:           Taxi Company Borough        425248     425248      100.00
 4:          Taxi Pick Up Location        425248     425248      100.00
 5:            Bridge Highway Name        425248     425248      100.00
 6:       Bridge Highway Direction        425248     425248      100.00
 7:                      Road Ramp        425248     425248      100.00
 8:         Bridge Highway Segment        425248     425248      100.00
 9:                  Facility Type        417587     425248       98.20
10:                 Cross Street 2        400618     425248       94.21
11:          Intersection Street 2        400618     425248       94.21
12:                 Cross Street 1        400546     425248       94.19
13:          Intersection Street 1        400546     425248       94.19
14:                       Landmark        400506     425248       94.18
15:                  Location Type         43668     425248       10.27
16:                    Closed Date          5151     425248        1.21
17:         Resolution Description          2417     425248        0.57
18:                            BBL          2302     425248        0.54
19: Resolution Action Updated Date           902     425248        0.21
cat("\nColumns with >50% missing:\n")

Columns with >50% missing:
print(missing_311[Missing_Pct > 50])
                      Column Missing_Count Total_Rows Missing_Pct
                      <char>         <int>      <int>       <num>
 1:                 Due Date        425248     425248      100.00
 2:             Vehicle Type        425248     425248      100.00
 3:     Taxi Company Borough        425248     425248      100.00
 4:    Taxi Pick Up Location        425248     425248      100.00
 5:      Bridge Highway Name        425248     425248      100.00
 6: Bridge Highway Direction        425248     425248      100.00
 7:                Road Ramp        425248     425248      100.00
 8:   Bridge Highway Segment        425248     425248      100.00
 9:            Facility Type        417587     425248       98.20
10:           Cross Street 2        400618     425248       94.21
11:    Intersection Street 2        400618     425248       94.21
12:           Cross Street 1        400546     425248       94.19
13:    Intersection Street 1        400546     425248       94.19
14:                 Landmark        400506     425248       94.18
# Save results for visualization
fwrite(missing_viol, "datasets/missing_values_violations.csv")
fwrite(missing_311, "datasets/missing_values_311.csv")

cat("\n\nMissing value summaries saved!\n")


Missing value summaries saved!

Missing Value Analysis - Code

library(data.table)
library(ggplot2)

set.seed(2025)

# Load missing value summaries
missing_viol <- fread("datasets/missing_values_violations.csv")
missing_311 <- fread("datasets/missing_values_311.csv")

# ========== GRAPH 1: BAR CHART - HOUSING VIOLATIONS ==========

# Filter to columns with >0% missing
missing_viol_plot <- missing_viol[Missing_Pct > 0]

# Create bar chart
p1 <- ggplot(missing_viol_plot, aes(x = reorder(Column, Missing_Pct), y = Missing_Pct)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Missing Values in Housing Violations Dataset",
    subtitle = "Manhattan, 2022-2025",
    x = "Column Name",
    y = "Percentage Missing (%)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text.y = element_text(size = 11)
  )

print(p1)

# ========== GRAPH 2: BAR CHART - 311 COMPLAINTS ==========

# Filter to columns with >0% missing
missing_311_plot <- missing_311[Missing_Pct > 0]

# Create bar chart
p2 <- ggplot(missing_311_plot, aes(x = reorder(Column, Missing_Pct), y = Missing_Pct)) +
  geom_bar(stat = "identity", fill = "coral") +
  coord_flip() +
  labs(
    title = "Missing Values in 311 Housing Complaints Dataset",
    subtitle = "Manhattan, 2022-2025",
    x = "Column Name",
    y = "Percentage Missing (%)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text.y = element_text(size = 11)
  )

print(p2)

# ========== SUMMARY STATISTICS ==========
cat("\n===== KEY FINDINGS =====\n\n")

===== KEY FINDINGS =====
cat("HOUSING VIOLATIONS:\n")
HOUSING VIOLATIONS:
cat("- Total columns:", nrow(missing_viol), "\n")
- Total columns: 41 
cat("- Columns with missing data:", nrow(missing_viol[Missing_Pct > 0]), "\n")
- Columns with missing data: 19 
cat("- Columns >50% missing:", nrow(missing_viol[Missing_Pct > 50]), "\n")
- Columns >50% missing: 3 
cat("- Columns 100% complete:", nrow(missing_viol[Missing_Pct == 0]), "\n\n")
- Columns 100% complete: 21 
cat("311 COMPLAINTS:\n")
311 COMPLAINTS:
cat("- Total columns:", nrow(missing_311), "\n")
- Total columns: 42 
cat("- Columns with missing data:", nrow(missing_311[Missing_Pct > 0]), "\n")
- Columns with missing data: 19 
cat("- Columns >50% missing:", nrow(missing_311[Missing_Pct > 50]), "\n")
- Columns >50% missing: 14 
cat("- Columns 100% missing:", nrow(missing_311[Missing_Pct == 100]), "\n")
- Columns 100% missing: 8 
cat("- Columns 100% complete:", nrow(missing_311[Missing_Pct == 0]), "\n")
- Columns 100% complete: 22 

2.2 Missing Value Analysis

The missing value patterns in both datasets reveal important characteristics about data collection and reporting practices.

Housing Violations Dataset

The Housing Violations dataset contains 589,005 records with 41 columns. Of these, 19 columns have missing values while 22 columns are complete. Three columns have more than 50% missing data:

  • NewCertifyByDate and NewCorrectByDate (99.4% missing): These fields are only populated when violations are re-certified or correction deadlines are extended, indicating that most violations follow the original timeline without modifications.
  • CertifiedDate (61.2% missing): This high percentage reflects that many violations remain open or were dismissed without certification, consistent with the status distribution showing significant numbers of dismissed and uncertified violations.

Additional notable patterns include 33.7% missing apartment numbers, likely representing violations that affect entire buildings or common areas rather than specific units. Geographic fields (Latitude, Longitude, Community Board) show minimal missingness (<0.02%), demonstrating excellent geocoding coverage for Manhattan addresses.

311 Housing Complaints Dataset

The 311 dataset contains 425,248 records with 42 columns. Eight columns are completely empty (100% missing), all related to transportation categories (taxi, vehicle, bridge/highway fields) that are irrelevant to housing complaints and can be excluded from analysis.

Cross-street and intersection fields show 94% missingness, as most complaints use address-based location rather than intersection descriptions. The Facility Type field (98.2% missing) and Landmark field (94.2% missing) are rarely used for residential complaints.

Critical fields for analysis have minimal missingness: BBL (0.54%), Closed Date (1.21% - only open cases), and Resolution Description (0.57%), ensuring robust data availability for investigating complaint resolution patterns and building-level aggregation.

Enhanced Missing Value Analysis - Add Heatmap

The error is because pivot_longer() can’t combine different data types (integers, characters, dates, etc.) into one column. We need to convert everything to character first. Here’s the fix:

library(data.table)
library(tidyverse)

# Load datasets
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
sr311_raw  <- fread("datasets/311_Housing_Complaints_2022_onwards.csv")

# ========== MISSING VALUE HEATMAP - HOUSING VIOLATIONS ==========

# Convert data.table to tibble and make all columns character for pivoting
viol_long <- as_tibble(housing_raw) |>
  mutate(across(everything(), as.character)) |>  # Convert all to character
  rownames_to_column("row_id") |>
  pivot_longer(cols = -row_id, names_to = "name", values_to = "value") |>
  mutate(missing = ifelse(is.na(value) | value == "", "yes", "no"))

# Plot heatmap - sample rows for visibility
ggplot(viol_long |> filter(as.numeric(row_id) <= 1000),  # First 1000 rows only
       aes(x = name, y = fct_rev(row_id), fill = missing)) +
  geom_tile(color = "white") +
  scale_fill_viridis_d() +
  labs(
    title = "Missing Data Pattern: Housing Violations (sample)",
    subtitle = "First 1000 records, Manhattan 2022-2025",
    x = "Column",
    y = "Row"
  ) +
  theme_bw(base_size = 14) +
  theme(
    axis.text.y = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# ========== MISSING VALUE HEATMAP - 311 COMPLAINTS ==========

complaints_long <- as_tibble(sr311_raw ) |>
  mutate(across(everything(), as.character)) |>  # Convert all to character
  rownames_to_column("row_id") |>
  pivot_longer(cols = -row_id, names_to = "name", values_to = "value") |>
  mutate(missing = ifelse(is.na(value) | value == "", "yes", "no"))

# Plot heatmap - sample rows
ggplot(complaints_long |> filter(as.numeric(row_id) <= 1000),  # First 1000 rows
       aes(x = name, y = fct_rev(row_id), fill = missing)) +
  geom_tile(color = "white") +
  scale_fill_viridis_d() +
  labs(
    title = "Missing Data Pattern: 311 Complaints (sample)",
    subtitle = "First 1000 records, Manhattan 2022-2025",
    x = "Column",
    y = "Row"
  ) +
  theme_bw(base_size = 14) +
  theme(
    axis.text.y = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

EXCELLENT POINT! You’re absolutely right to question this!

Why Random Sampling? The heatmap with 589,005 rows would be:

Computationally expensive (slow to render)

Visually useless - you can’t see individual tiles when there are 589K tiny rows on the y-axis

The image would be a giant blur

BUT - you already have the complete missing value analysis in your bar charts! Those show the exact percentages for all rows.

Better Question: Do You Even Need the Heatmap? Looking at the lecture slides you shared, the heatmap is useful when:

Small dataset (like mtcars with 32 rows)

You want to see patterns of missingness (e.g., “if column A is missing, column B is also missing”)

For your large datasets, the bar charts you already made are BETTER because: ✅ Show exact percentages ✅ Easy to read ✅ Already done and look great!

library(redav)

# Simple aggregated missing plot
plot_missing(housing_raw, percent = TRUE)

plot_missing(sr311_raw , percent = TRUE)

library(redav)

# ========== HOUSING VIOLATIONS - FIXED ==========
plot_missing(housing_raw, 
             percent = TRUE,
             num_char = 5, max_cols = 10)  # Shorten column names to 8 characters

# Alternative: Manually create cleaner version
# First, let's see how many columns we're dealing with
cat("Number of columns:", ncol(housing_raw))
Number of columns: 41
library(data.table)
library(ggplot2)

# Load missing value summary
missing_viol <- fread("datasets/missing_values_violations.csv")
missing_311 <- fread("datasets/missing_values_311.csv")

# Filter to only columns with significant missingness (>5%)
missing_viol_sig <- missing_viol[Missing_Pct > 5]
missing_311_sig <- missing_311[Missing_Pct > 5]

# Create subset datasets with only these columns
cols_to_keep_viol <- missing_viol_sig$Column
cols_to_keep_311 <- missing_311_sig$Column

dt_viol_subset <- housing_raw[, ..cols_to_keep_viol]
sr311_raw_subset <- sr311_raw[, ..cols_to_keep_311]

# Now plot with fewer columns (more readable)
plot_missing(dt_viol_subset, percent = TRUE)

plot_missing(sr311_raw_subset, percent = TRUE)

ask Prof about formatting or is that even required ?

Research Question: When do housing violations occur? Are there seasonal patterns (e.g., more heating violations in winter)?

What we’re trying to find:

Do violations increase during certain months?

Are there yearly trends?

Any COVID-related changes?

Graph type: Time series line chart

library(data.table)
library(ggplot2)

set.seed(2025)

# Load data
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")

# Convert date and extract time components
housing_raw[, InspectionDate := as.Date(InspectionDate)]
housing_raw[, YearMonth := format(InspectionDate, "%Y-%m")]
housing_raw[, Month := month(InspectionDate)]
housing_raw[, Year := year(InspectionDate)]

# Count violations by month and year
viol_by_month <- housing_raw[, .N, by = .(Year, Month)][order(Year, Month)]

# Create the plot
ggplot(viol_by_month, aes(x = Month, y = N, color = factor(Year), group = Year)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.5) +
  scale_x_continuous(breaks = 1:12, 
                     labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Housing Violations by Month and Year",
    subtitle = "Manhattan, 2022-2025",
    x = "Month",
    y = "Number of Violations",
    color = "Year"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "right"
  )

# Investigate February 2022
feb_2022 <- housing_raw[Year == 2022 & Month == 2]
cat("February 2022 violations:", nrow(feb_2022), "\n")
February 2022 violations: 28414 
# Check if they're all inspected in Feb or entered in Feb
summary(feb_2022$InspectionDate)
        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"2022-02-01" "2022-02-15" "2022-02-15" "2022-02-15" "2022-02-16" "2022-02-28" 
summary(feb_2022$ApprovedDate)
   Length     Class      Mode 
    28414 character character 

🔍 What the Data Reveals: February 2022 has 28,414 violations - and looking at the dates:

Median/Mean inspection date: February 15, 2022

Most inspections clustered around Feb 15-16

This is NOT a data error - this appears to be a mass inspection event or systematic sweep by HPD in mid-February 2022.

Possible Explanations: Annual inspection cycle - HPD may conduct building-wide inspections at this time

Post-winter inspection sweep - checking heating violations after winter

Administrative/policy change - new enforcement initiative started

Backlog processing - accumulated complaints processed together

Updated Commentary for Graph: “Housing violations show notable spikes in early 2022 (particularly February with ~29,000 violations), likely reflecting systematic inspection efforts by HPD. Seasonal patterns are less pronounced than expected, with violations remaining relatively stable across months at 10,000-12,000 per month after the initial 2022 surge. The 2025 data shows incomplete year-to-date figures through November. This suggests violations are driven more by inspection scheduling and enforcement policy than by seasonal factors like winter heating issues.”

Graph 2: Smoothed Temporal View Research Question: What are the overall trends when we smooth out monthly fluctuations?

library(data.table)
library(ggplot2)

# Aggregate by quarter for smoother trend
housing_raw[, Quarter := quarter(InspectionDate)]
housing_raw[, YearQuarter := paste0(Year, "-Q", Quarter)]

viol_by_quarter <- housing_raw[, .N, by = .(Year, Quarter, YearQuarter)][order(Year, Quarter)]

# Create quarter labels
viol_by_quarter[, QuarterLabel := paste0(Year, "\nQ", Quarter)]

# Plot
ggplot(viol_by_quarter, aes(x = factor(YearQuarter, levels = YearQuarter), 
                              y = N, group = 1)) +
  geom_line(color = "steelblue", linewidth = 1.2) +
  geom_point(color = "steelblue", size = 3) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Housing Violations by Quarter",
    subtitle = "Manhattan, 2022-2025 (smoothed trend)",
    x = "Quarter",
    y = "Number of Violations"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

The quarterly graph shows a much clearer trend - violations started high in Q1 2022 (~48K), dropped, then fluctuated between 30K-47K per quarter, with a recent decline in Q4 2025.

Key Patterns: Sharp Drop After Q1 2022

Q1 2022: ~48,000 violations

Q2 2022: ~30,000 violations (37% decrease)

This confirms the Feb 2022 spike was a one-time event

Cyclical Pattern

Violations rise and fall in waves

Peaks: Q4 2022 (~41K), Q2 2024 (~47K), Q3 2025 (~41K)

Troughs: Q2 2022 (~30K), Q1 2025 (~30K)

No Clear Seasonality

No consistent “winter spike” for heating violations

Fluctuations appear more related to enforcement cycles than seasons

Recent Decline

Q4 2025: ~26,000 (lowest point)

Could be incomplete data (we’re currently in November 2025)

OR enforcement has genuinely decreased

Graph 3: What Types of Violations Are Most Common? Research Question: What are the most common housing violations that tenants and inspectors encounter?

library(data.table)
library(ggplot2)
library(stringr)

# Check what violation type columns we have
cat("Checking violation classification columns:\n")
Checking violation classification columns:
cat("CurrentStatus values:\n")
CurrentStatus values:
print(head(housing_raw[, .N, by=CurrentStatus][order(-N)], 10))
                               CurrentStatus      N
                                      <char>  <int>
 1:                      VIOLATION DISMISSED 197906
 2:                         VIOLATION CLOSED 180829
 3:                             NOV SENT OUT 112090
 4: FIRST NO ACCESS TO RE- INSPECT VIOLATION  18629
 5:                        NOT COMPLIED WITH  17976
 6:                        INFO NOV SENT OUT  12927
 7:        NOTICE OF ISSUANCE SENT TO TENANT  11160
 8:            VIOLATION WILL BE REINSPECTED  10263
 9:                             CIV14 MAILED   7244
10:                     DEFECT LETTER ISSUED   5934
cat("\n\nViolationStatus values:\n")


ViolationStatus values:
print(head(housing_raw[, .N, by=ViolationStatus][order(-N)], 10))
   ViolationStatus      N
            <char>  <int>
1:           Close 386382
2:            Open 202623
cat("\n\nClass values:\n")


Class values:
print(head(housing_raw[, .N, by=Class][order(-N)], 10))
    Class      N
   <char>  <int>
1:      B 237753
2:      C 184746
3:      A 132840
4:      I  33666
cat("\n\nRentImpairing values:\n")


RentImpairing values:
print(housing_raw[, .N, by=RentImpairing])
   RentImpairing      N
          <char>  <int>
1:             N 542652
2:             Y  46353
# NOVDescription is too detailed, let's extract key violation types
# Show a few examples first
cat("\n\nSample NOVDescription:\n")


Sample NOVDescription:
print(head(housing_raw$NOVDescription, 5))
[1] "(A) § HMC:FILE ANNUAL BEDBUG REPORT IN ACCORDANCE WITH HPD RULE AS DESCRIBED ON THE BACK OF THIS NOTICE OF VIOLATION OR AS DESCRIBED ON HPDâ\u0080\u0099S WEBSITE, WWW.NYC.GOV\\HPD, SEARCH BED BUGS."
[2] "§ 27-2005 ADM CODE PAINT METAL IN ACCORDANCE WITH DEPT. REGULATION RADIATOR AT THE SOUTH WALL IN THE KITCHEN LOCATED AT APT 9C, 9th STORY, 3rd APARTMENT FROM EAST AT SOUTH"                          
[3] "§ 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT SINK IN THE KITCHENETTE LOCATED AT B-ROOM 3N, 3rd STORY, 1st B-ROOM FROM NORTH AT EAST"                                               
[4] "§ 27-2005, 27-2007, 27-2041.1 HMC: REPLACE OR REPAIR THE SELF-CLOSING DOORS THAT IS MISSING OR DEFECTIVE AT BULKHEAD"                                                                                 
[5] "§ 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT WASHBASIN IN THE BATHROOM LOCATED AT APT 20B, 2nd STORY, 1st APARTMENT FROM NORTH AT EAST"                                            

Graph 3: Violation Severity and Status Research Question: What types of violations are most common, and how seriously are they being addressed?

library(data.table)
library(ggplot2)

# === GRAPH 3A: Violation Class Distribution ===
# Class definitions: A=Non-Hazardous, B=Hazardous, C=Immediately Hazardous, I=Failure to Register

class_data <- housing_raw[, .N, by=Class][order(-N)]

# Add descriptive labels
class_data[, ClassLabel := fcase(
  Class == "A", "Class A: Non-Hazardous",
  Class == "B", "Class B: Hazardous",
  Class == "C", "Class C: Immediately Hazardous",
  Class == "I", "Class I: Failure to Register",
  default = Class
)]

ggplot(class_data, aes(x = reorder(ClassLabel, N), y = N, fill = Class)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = scales::comma(N)), hjust = -0.1, size = 5) +
  coord_flip() +
  scale_fill_manual(values = c("A" = "#90EE90", "B" = "#FFD700", 
                                "C" = "#FF6347", "I" = "#87CEEB")) +
  scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Housing Violations by Severity Class",
    subtitle = "Manhattan, 2022-2025",
    x = "Violation Class",
    y = "Number of Violations",
    caption = "Class A: Non-hazardous | Class B: Hazardous | Class C: Immediately Hazardous"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "none"
  )

# === GRAPH 3B: Rent-Impairing Violations ===

rent_data <- housing_raw[, .N, by=RentImpairing]
rent_data[, Label := ifelse(RentImpairing == "Y", 
                             "Rent-Impairing\n(affects habitability)", 
                             "Non-Rent-Impairing")]

ggplot(rent_data, aes(x = Label, y = N, fill = RentImpairing)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(scales::comma(N), "\n(", 
                                round(N/sum(N)*100, 1), "%)")), 
            vjust = -0.5, size = 5) +
  scale_fill_manual(values = c("N" = "steelblue", "Y" = "darkred")) +
  scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "Rent-Impairing vs Non-Rent-Impairing Violations",
    subtitle = "Manhattan, 2022-2025",
    x = "",
    y = "Number of Violations"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "none"
  )

Excellent! Both graphs look great! Let me analyze them:​

📊 Graph 3A: Violation Severity Analysis Key Findings:​

Class B (Hazardous): 237,753 violations (40.4%) - MOST COMMON

Class C (Immediately Hazardous): 184,746 (31.4%) - VERY SERIOUS

Class A (Non-Hazardous): 132,840 (22.6%)

Class I (Failure to Register): 33,666 (5.7%)

Commentary: “Over 70% of violations are classified as hazardous (Class B) or immediately hazardous (Class C), indicating serious safety and habitability concerns in Manhattan housing. Class B violations, which require correction within 30 days, dominate the landscape. This high proportion of serious violations underscores the importance of thorough building inspection before renting.”

📊 Graph 3B: Rent-Impairing Analysis Key Findings:​

Non-Rent-Impairing: 542,652 (92.1%) - Vast majority

Rent-Impairing: 46,353 (7.9%) - Serious habitability issues

Commentary: “Only 7.9% of violations are classified as rent-impairing, meaning they directly affect habitability and could legally justify rent withholding. While most violations are non-rent-impairing, the 46,000+ rent-impairing violations represent conditions severe enough to make apartments legally uninhabitable, such as lack of heat, hot water, or structural hazards.”

Cleaning

library(data.table)
library(stringr)

# Load data
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")

# First, let's see what's in NOVDescription
cat("Sample NOVDescription text:\n")
Sample NOVDescription text:
print(head(housing_raw$NOVDescription, 10))
 [1] "(A) § HMC:FILE ANNUAL BEDBUG REPORT IN ACCORDANCE WITH HPD RULE AS DESCRIBED ON THE BACK OF THIS NOTICE OF VIOLATION OR AS DESCRIBED ON HPDâ\u0080\u0099S WEBSITE, WWW.NYC.GOV\\HPD, SEARCH BED BUGS."                                 
 [2] "§ 27-2005 ADM CODE PAINT METAL IN ACCORDANCE WITH DEPT. REGULATION RADIATOR AT THE SOUTH WALL IN THE KITCHEN LOCATED AT APT 9C, 9th STORY, 3rd APARTMENT FROM EAST AT SOUTH"                                                           
 [3] "§ 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT SINK IN THE KITCHENETTE LOCATED AT B-ROOM 3N, 3rd STORY, 1st B-ROOM FROM NORTH AT EAST"                                                                                
 [4] "§ 27-2005, 27-2007, 27-2041.1 HMC: REPLACE OR REPAIR THE SELF-CLOSING DOORS THAT IS MISSING OR DEFECTIVE AT BULKHEAD"                                                                                                                  
 [5] "§ 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT WASHBASIN IN THE BATHROOM LOCATED AT APT 20B, 2nd STORY, 1st APARTMENT FROM NORTH AT EAST"                                                                             
 [6] "§ 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 7, 3rd STORY, 1st APARTMENT FROM EAST AT SOUTH"                                                             
 [7] "§ 27-2045 ADM CODE REPAIR OR REPLACE THE SMOKE DETECTOR MISSING LOCATED AT APT 2A, 2nd STORY, 1st APARTMENT FROM WEST AT NORTH"                                                                                                        
 [8] "§ 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 5C, 5th STORY, 1st APARTMENT FROM WEST AT NORTH"                                                            
 [9] "§ 27-2045 ADM CODE REPAIR OR REPLACE THE SMOKE DETECTOR MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 5C, 5th STORY, 1st APARTMENT FROM WEST AT NORTH"                                                                                
[10] "§ 27-2005 ADM CODE REPAIR THE BROKEN OR DEFECTIVE PLASTERED SURFACES AND PAINT IN A UNIFORM COLOR EAST WALL APPROX 4SQ FT AND NORTH WALL APPROX 4 SQ FT IN THE BATHROOM LOCATED AT APT 1A, 1st STORY, 2nd APARTMENT FROM SOUTH AT WEST"
# Extract key violation keywords from descriptions
housing_raw[, ViolationCategory := fcase(
  # Heating/Hot Water
  str_detect(NOVDescription, regex("heat|hot water|radiator|boiler|steam", ignore_case = TRUE)), 
  "HEAT/HOT WATER",
  
  # Plumbing
  str_detect(NOVDescription, regex("plumbing|pipe|leak|faucet|drain|water supply|washbasin", ignore_case = TRUE)), 
  "PLUMBING",
  
  # Paint/Plaster
  str_detect(NOVDescription, regex("paint|plaster|peeling|wall|ceiling", ignore_case = TRUE)), 
  "PAINT/PLASTER",
  
  # Water Leak (specific)
  str_detect(NOVDescription, regex("water leak|leaking|water damage", ignore_case = TRUE)), 
  "WATER LEAK",
  
  # Elevator
  str_detect(NOVDescription, regex("elevator|lift", ignore_case = TRUE)), 
  "ELEVATOR",
  
  # Mold
  str_detect(NOVDescription, regex("mold|mildew|moisture", ignore_case = TRUE)), 
  "MOLD",
  
  # Building Structure
  str_detect(NOVDescription, regex("structural|building condition|facade|exterior wall", ignore_case = TRUE)), 
  "BUILDING CONDITION",
  
  # Electrical
  str_detect(NOVDescription, regex("electric|wiring|outlet|light", ignore_case = TRUE)), 
  "ELECTRICAL",
  
  # Pest/Sanitation
  str_detect(NOVDescription, regex("rodent|pest|bedbug|garbage|sanitation", ignore_case = TRUE)), 
  "PEST/SANITATION",
  
  # Door/Window
  str_detect(NOVDescription, regex("door|window|lock|self-closing", ignore_case = TRUE)), 
  "DOOR/WINDOW",
  
  # Default
  default = "OTHER"
)]

# Check the distribution
cat("\nExtracted Violation Categories:\n")

Extracted Violation Categories:
print(housing_raw[, .N, by=ViolationCategory][order(-N)])
     ViolationCategory      N
                <char>  <int>
 1:              OTHER 183558
 2:      PAINT/PLASTER 176654
 3:        DOOR/WINDOW  92915
 4:    PEST/SANITATION  48138
 5:           PLUMBING  43787
 6:     HEAT/HOT WATER  39992
 7:         ELECTRICAL   2187
 8:               MOLD   1346
 9:           ELEVATOR    267
10: BUILDING CONDITION     87
11:         WATER LEAK     74
# Compare with 311 Complaint Types
sr311_raw  <- fread("datasets/311_Housing_Complaints_2022_onwards.csv")
cat("\n\n311 Complaint Types:\n")


311 Complaint Types:
print(sr311_raw [, .N, by=`Complaint Type`][order(-N)])
                   Complaint Type      N
                           <char>  <int>
 1:                HEAT/HOT WATER 229222
 2:                      PLUMBING  50637
 3:                 PAINT/PLASTER  44624
 4:                    WATER LEAK  30190
 5: General Construction/Plumbing  25207
 6:       Maintenance or Facility  20202
 7:                      Elevator  16588
 8:                      Plumbing   2197
 9:                      ELEVATOR   1302
10:          Non-Residential Heat   1094
11:            School Maintenance   1064
12:              OUTSIDE BUILDING    857
13:                    Water Leak    556
14:                Heat/Hot Water    487
15:                 Paint/Plaster    404
16:                          Mold    402
17:  Building Drinking Water Tank     83
18:             Sewer Maintenance     68
19:             Water Maintenance     31
20:            Building Condition     19
21:             Unstable Building     14
                   Complaint Type      N
library(data.table)
library(stringr)

# Load data
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")

# STEP 1: Create ViolationCategory column first
housing_raw[, ViolationCategory := fcase(
  # Heating/Hot Water
  str_detect(NOVDescription, regex("heat|hot water|radiator|boiler|steam", ignore_case = TRUE)), 
  "HEAT/HOT WATER",
  
  # Plumbing
  str_detect(NOVDescription, regex("plumbing|pipe|faucet|drain|water supply|washbasin", ignore_case = TRUE)), 
  "PLUMBING",
  
  # Paint/Plaster
  str_detect(NOVDescription, regex("paint|plaster|peeling|wall|ceiling", ignore_case = TRUE)), 
  "PAINT/PLASTER",
  
  # Water Leak (specific)
  str_detect(NOVDescription, regex("leak", ignore_case = TRUE)), 
  "WATER LEAK",
  
  # Elevator
  str_detect(NOVDescription, regex("elevator|lift", ignore_case = TRUE)), 
  "ELEVATOR",
  
  # Mold
  str_detect(NOVDescription, regex("mold|mildew", ignore_case = TRUE)), 
  "MOLD",
  
  # Building Structure
  str_detect(NOVDescription, regex("structural|facade|exterior", ignore_case = TRUE)), 
  "BUILDING CONDITION",
  
  # Electrical
  str_detect(NOVDescription, regex("electric|wiring|outlet|light", ignore_case = TRUE)), 
  "ELECTRICAL",
  
  # Pest/Sanitation
  str_detect(NOVDescription, regex("rodent|pest|bedbug|garbage|sanitation", ignore_case = TRUE)), 
  "PEST/SANITATION",
  
  # Door/Window
  str_detect(NOVDescription, regex("door|window|lock|self-closing", ignore_case = TRUE)), 
  "DOOR/WINDOW",
  
  # Default
  default = "OTHER"
)]

# STEP 2: Now check distribution
cat("Extracted Violation Categories:\n")
Extracted Violation Categories:
print(housing_raw[, .N, by=ViolationCategory][order(-N)])
     ViolationCategory      N
                <char>  <int>
 1:      PAINT/PLASTER 207685
 2:              OTHER 183562
 3:        DOOR/WINDOW  92596
 4:    PEST/SANITATION  48140
 5:     HEAT/HOT WATER  39992
 6:           PLUMBING  11015
 7:         ELECTRICAL   2187
 8:         WATER LEAK   1741
 9:               MOLD   1344
10: BUILDING CONDITION    475
11:           ELEVATOR    268
# STEP 3: Check what's in OTHER
cat("\n\nSample 'OTHER' violations (first 10):\n")


Sample 'OTHER' violations (first 10):
print(head(housing_raw[ViolationCategory == "OTHER", NOVDescription], 10))
 [1] "§ 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 7, 3rd STORY, 1st APARTMENT FROM EAST AT SOUTH"                                  
 [2] "§ 27-2045 ADM CODE REPAIR OR REPLACE THE SMOKE DETECTOR MISSING LOCATED AT APT 2A, 2nd STORY, 1st APARTMENT FROM WEST AT NORTH"                                                                             
 [3] "§ 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 5C, 5th STORY, 1st APARTMENT FROM WEST AT NORTH"                                 
 [4] "§ 27-2045 ADM CODE REPAIR OR REPLACE THE SMOKE DETECTOR MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 5C, 5th STORY, 1st APARTMENT FROM WEST AT NORTH"                                                     
 [5] "§ 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 51, 5th STORY, 1st APARTMENT FROM SOUTH AT WEST"                                 
 [6] "HMC ADM CODE: § 27-2017.4 ABATE THE INFESTATION CONSISTING OF ROACHES IN THE ENTIRE APARTMENT LOCATED AT APT 1A, 2nd STORY, 1st APARTMENT FROM WEST AT NORTH"                                               
 [7] "§ 27-2005 ADM CODE PROPERLY REPAIR WITH SIMILAR MATERIAL THE BROKEN OR DEFECTIVE WOOD FLOOR CRACKED,SAGGING, NEAR THE BATHROOM. IN THE FOYER LOCATED AT APT 5F, 5th STORY, 2nd APARTMENT FROM EAST AT SOUTH"
 [8] "§ 27-2005 ADM CODE PROPERLY REPAIR WITH SIMILAR MATERIAL THE BROKEN OR DEFECTIVE CERAMIC TILE AT FLOOR. IN THE BATHROOM LOCATED AT APT 5F, 5th STORY, 2nd APARTMENT FROM EAST AT SOUTH"                     
 [9] "HMC ADM CODE: § 27-2017.4 ABATE THE INFESTATION CONSISTING OF ROACHES LOCATED AT APT 1C, 1st STORY, APARTMENT AT EAST"                                                                                      
[10] "§ 27-2053 ADM CODE PROVIDE DWELLING WITH A JANITOR OR RESPONSIBLE PERSON OR JANITORIAL SERVICE."                                                                                                            
# STEP 4: Check Class distribution in OTHER
cat("\n\nClass distribution in OTHER:\n")


Class distribution in OTHER:
print(housing_raw[ViolationCategory == "OTHER", .N, by=Class][order(-N)])
    Class     N
   <char> <int>
1:      B 83067
2:      C 53878
3:      I 33666
4:      A 12951
library(data.table)
library(stringr)

housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")

# Improved categorization
housing_raw[, ViolationCategory := fcase(
  # Registration (Class I)
  Class == "I" | str_detect(NOVDescription, regex("registration|register", ignore_case = TRUE)),
  "REGISTRATION/ADMIN",
  
  # Smoke/CO Detectors (VERY COMMON - separate category)
  str_detect(NOVDescription, regex("smoke detector|carbon monoxide|co detect", ignore_case = TRUE)),
  "SMOKE/CO DETECTOR",
  
  # Heating/Hot Water
  str_detect(NOVDescription, regex("heat|hot water|radiator|boiler|steam", ignore_case = TRUE)), 
  "HEAT/HOT WATER",
  
  # Paint/Plaster/Walls/Ceilings
  str_detect(NOVDescription, regex("paint|plaster|peel|wall|ceiling", ignore_case = TRUE)), 
  "PAINT/PLASTER",
  
  # Plumbing
  str_detect(NOVDescription, regex("plumb|pipe|faucet|drain|water supply|washbasin|toilet|sink", ignore_case = TRUE)), 
  "PLUMBING",
  
  # Water Leak
  str_detect(NOVDescription, regex("leak", ignore_case = TRUE)), 
  "WATER LEAK",
  
  # Door/Window/Lock
  str_detect(NOVDescription, regex("door|window|lock|self-closing|entrance", ignore_case = TRUE)), 
  "DOOR/WINDOW/LOCK",
  
  # Pest/Sanitation
  str_detect(NOVDescription, regex("rodent|pest|bedbug|bed bug|roach|mice|rat|garbage|sanitation|infest", ignore_case = TRUE)), 
  "PEST/SANITATION",
  
  # Floor/Ceiling defects
  str_detect(NOVDescription, regex("floor|tile|carpet|wood floor|ceramic", ignore_case = TRUE)),
  "FLOOR/CEILING",
  
  # Elevator
  str_detect(NOVDescription, regex("elevator|lift", ignore_case = TRUE)), 
  "ELEVATOR",
  
  # Mold
  str_detect(NOVDescription, regex("mold|mildew|moisture", ignore_case = TRUE)), 
  "MOLD",
  
  # Electrical
  str_detect(NOVDescription, regex("electric|wiring|outlet|light", ignore_case = TRUE)), 
  "ELECTRICAL",
  
  # Ventilation
  str_detect(NOVDescription, regex("ventilat|airflow|exhaust fan", ignore_case = TRUE)),
  "VENTILATION",
  
  # Building Management
  str_detect(NOVDescription, regex("janitor|superintendent|building service", ignore_case = TRUE)),
  "BUILDING MANAGEMENT",
  
  # Fire Safety
  str_detect(NOVDescription, regex("fire|sprinkler|fire escape|extinguish", ignore_case = TRUE)),
  "FIRE SAFETY",
  
  # Default
  default = "OTHER"
)]

# Check new distribution
cat("REFINED Violation Categories:\n")
REFINED Violation Categories:
print(housing_raw[, .N, by=ViolationCategory][order(-N)])
      ViolationCategory      N
                 <char>  <int>
 1:       PAINT/PLASTER 206753
 2:     PEST/SANITATION  96473
 3:    DOOR/WINDOW/LOCK  95429
 4:   SMOKE/CO DETECTOR  40227
 5:      HEAT/HOT WATER  39990
 6:  REGISTRATION/ADMIN  35133
 7:               OTHER  24747
 8:       FLOOR/CEILING  23140
 9:            PLUMBING  15515
10:         FIRE SAFETY   7127
11:          ELECTRICAL   1753
12:          WATER LEAK   1731
13: BUILDING MANAGEMENT    377
14:                MOLD    363
15:         VENTILATION    166
16:            ELEVATOR     81
cat("\n\nRemaining OTHER (should be much smaller now):\n")


Remaining OTHER (should be much smaller now):
cat("Count:", nrow(housing_raw[ViolationCategory == "OTHER"]), "\n")
Count: 24747 
print(head(housing_raw[ViolationCategory == "OTHER", NOVDescription], 5))
[1] "§ 27-2026 ADM CODE RESET, SO AS TO SECURE A PROPER AND TIGHT CONNECTION, THE WATER CLOSET BOWL ... IN THE BATHROOM LOCATED AT APT 3A, 3rd STORY, 1st APARTMENT FROM EAST AT SOUTH"        
[2] "§ 27-2005 ADM CODE PROPERLY REPAIR THE BROKEN OR DEFECTIVE BELL BUZZER INTERCOM SYSTEM LOCATED AT APT 3A, 3rd STORY, 1st APARTMENT FROM EAST AT SOUTH"                                    
[3] "§ 27-2070 ADM CODE PROVIDE AN ADEQUATE SUPPLY OF GAS TO THE FIXTURES TO STOVE IN THE ENTIRE APARTMENT LOCATED AT APT 2B, 2nd STORY, 1st APARTMENT FROM EAST AT SOUTH"                     
[4] "§ 27-2005, 2007 ADM CODE REMOVE ALL ENCUMBRANCES CONSISTING OF BICYCLE AND HOUSEHOLD ITEMS AT PUBLIC HALL, 4th STORY"                                                                     
[5] "§ 27-2024 ADM CODE PROVIDE ADEQUATE SUPPLY OF COLD WATER FOR THE FIXTURES NO RUNNING COLD WATER. IN THE ENTIRE APARTMENT LOCATED AT APT 24F, 24th STORY, 3rd APARTMENT FROM NORTH AT EAST"
library(data.table)
library(stringr)
library(ggplot2)

housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
sr311_raw  <- fread("datasets/311_Housing_Complaints_2022_onwards.csv")

# Final refined categorization
housing_raw[, ViolationCategory := fcase(
  Class == "I" | str_detect(NOVDescription, regex("registration|register", ignore_case = TRUE)),
  "REGISTRATION/ADMIN",
  
  str_detect(NOVDescription, regex("smoke detector|carbon monoxide|co detect", ignore_case = TRUE)),
  "SMOKE/CO DETECTOR",
  
  str_detect(NOVDescription, regex("heat|hot water|radiator|boiler|steam", ignore_case = TRUE)), 
  "HEAT/HOT WATER",
  
  str_detect(NOVDescription, regex("paint|plaster|peel|wall|ceiling", ignore_case = TRUE)), 
  "PAINT/PLASTER",
  
  str_detect(NOVDescription, regex("plumb|pipe|faucet|drain|water supply|washbasin|toilet|sink|water closet|cold water", ignore_case = TRUE)), 
  "PLUMBING",
  
  str_detect(NOVDescription, regex("leak", ignore_case = TRUE)), 
  "WATER LEAK",
  
  str_detect(NOVDescription, regex("door|window|lock|self-closing|entrance", ignore_case = TRUE)), 
  "DOOR/WINDOW/LOCK",
  
  str_detect(NOVDescription, regex("rodent|pest|bedbug|bed bug|roach|mice|rat|garbage|sanitation|infest", ignore_case = TRUE)), 
  "PEST/SANITATION",
  
  str_detect(NOVDescription, regex("floor|tile|carpet|wood floor|ceramic", ignore_case = TRUE)),
  "FLOOR/CEILING",
  
  str_detect(NOVDescription, regex("elevator|lift", ignore_case = TRUE)), 
  "ELEVATOR",
  
  str_detect(NOVDescription, regex("mold|mildew|moisture", ignore_case = TRUE)), 
  "MOLD",
  
  str_detect(NOVDescription, regex("electric|wiring|outlet|light", ignore_case = TRUE)), 
  "ELECTRICAL",
  
  str_detect(NOVDescription, regex("gas|stove|appliance", ignore_case = TRUE)),
  "GAS/APPLIANCES",
  
  str_detect(NOVDescription, regex("ventilat|airflow|exhaust fan", ignore_case = TRUE)),
  "VENTILATION",
  
  str_detect(NOVDescription, regex("janitor|superintendent|building service", ignore_case = TRUE)),
  "BUILDING MANAGEMENT",
  
  str_detect(NOVDescription, regex("fire|sprinkler|fire escape|extinguish", ignore_case = TRUE)),
  "FIRE SAFETY",
  
  str_detect(NOVDescription, regex("bell|buzzer|intercom", ignore_case = TRUE)),
  "BUILDING SYSTEMS",
  
  default = "OTHER"
)]

# Comparison data
viol_counts <- housing_raw[, .N, by=ViolationCategory][order(-N)]
cat("Final Violation Categories:\n")
Final Violation Categories:
print(viol_counts)
      ViolationCategory      N
                 <char>  <int>
 1:       PAINT/PLASTER 206753
 2:    DOOR/WINDOW/LOCK  95419
 3:     PEST/SANITATION  94741
 4:   SMOKE/CO DETECTOR  40227
 5:      HEAT/HOT WATER  39990
 6:  REGISTRATION/ADMIN  35133
 7:       FLOOR/CEILING  23076
 8:            PLUMBING  19386
 9:               OTHER  18061
10:         FIRE SAFETY   7120
11:      GAS/APPLIANCES   4008
12:          ELECTRICAL   1750
13:          WATER LEAK   1723
14:    BUILDING SYSTEMS    679
15: BUILDING MANAGEMENT    377
16:                MOLD    362
17:         VENTILATION    119
18:            ELEVATOR     81
# Map 311 to standardized categories for comparison
sr311_raw [, Category := fcase(
  str_detect(`Complaint Type`, regex("HEAT|HOT WATER", ignore_case = TRUE)), "HEAT/HOT WATER",
  str_detect(`Complaint Type`, regex("PLUMBING", ignore_case = TRUE)), "PLUMBING",
  str_detect(`Complaint Type`, regex("PAINT|PLASTER", ignore_case = TRUE)), "PAINT/PLASTER",
  str_detect(`Complaint Type`, regex("WATER LEAK", ignore_case = TRUE)), "WATER LEAK",
  str_detect(`Complaint Type`, regex("ELEVATOR", ignore_case = TRUE)), "ELEVATOR",
  str_detect(`Complaint Type`, regex("MOLD", ignore_case = TRUE)), "MOLD",
  default = "OTHER"
)]

complaints_counts <- sr311_raw [, .N, by=Category][order(-N)]
cat("\n311 Mapped Categories:\n")

311 Mapped Categories:
print(complaints_counts)
         Category      N
           <char>  <int>
1: HEAT/HOT WATER 230803
2:       PLUMBING  78041
3:  PAINT/PLASTER  45028
4:     WATER LEAK  30746
5:          OTHER  22338
6:       ELEVATOR  17890
7:           MOLD    402
library(data.table)
library(ggplot2)
library(tidyr)

# Prepare data for comparison - focus on categories that exist in BOTH datasets
comparison_categories <- c("HEAT/HOT WATER", "PLUMBING", "PAINT/PLASTER", 
                          "WATER LEAK", "ELEVATOR", "MOLD")

# Get counts for violations
viol_comparison <- housing_raw[ViolationCategory %in% comparison_categories, 
                                  .(Count = .N), by = ViolationCategory]
viol_comparison[, Source := "Housing Violations"]
setnames(viol_comparison, "ViolationCategory", "Category")

# Get counts for 311 complaints
complaints_comparison <- sr311_raw [Category %in% comparison_categories, 
                                .(Count = .N), by = Category]
complaints_comparison[, Source := "311 Complaints"]

# Combine
combined <- rbind(viol_comparison, complaints_comparison)

# Create grouped bar chart
ggplot(combined, aes(x = reorder(Category, -Count), y = Count, fill = Source)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = scales::comma(Count)), 
            position = position_dodge(width = 0.9), 
            vjust = -0.5, size = 4) +
  scale_fill_manual(values = c("Housing Violations" = "#FF6B6B", 
                                "311 Complaints" = "#4ECDC4")) +
  scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Violations vs 311 Complaints by Category",
    subtitle = "Manhattan, 2022-2025 - Comparable Categories Only",
    x = "Category",
    y = "Count",
    fill = "Source"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )